In [4]:
import pandas as pd
import numpy as np
import seaborn as sns
from sklearn.linear_model import Ridge, RidgeCV, ElasticNet, LassoCV, LassoLarsCV
from sklearn.ensemble import RandomForestRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import ExtraTreesRegressor
from sklearn import svm
import lightgbm as lgb
from sklearn import linear_model
from sklearn.model_selection import cross_val_score
import matplotlib
import matplotlib.pyplot as plt
from scipy.stats import skew
from scipy.stats.stats import pearsonr
from scipy import stats
import xgboost as xgb
from scipy.stats import norm
#from pyglmnet import GLM # Marco: need to understand how to install this 
from sklearn.preprocessing import StandardScaler
from subprocess import call
from sklearn.cross_validation import KFold

from IPython import get_ipython
get_ipython().run_line_magic('matplotlib', 'inline')

# Create Ensemble class

In [5]:
class Ensemble(object):
    def __init__(self, n_folds, stacker, base_models,seed_value):
        self.n_folds = n_folds
        self.stacker = stacker
        self.base_models = base_models
        self.seed_value = seed_value
        
    def fit_predict(self, X, y, T):
        X = np.array(X)
        y = np.array(y)
        T = np.array(T)
        
        folds = list(KFold(len(y), n_folds=self.n_folds, shuffle=True, random_state=self.seed_value))
        #folds = KFold(n_splits=self.n_folds, shuffle=True, random_state=self.seed_value)
        
        print("folds=",folds)
        
        S_train = np.zeros((X.shape[0], len(self.base_models)))
        S_test = np.zeros((T.shape[0], len(self.base_models)))
        
        for i, reg in enumerate(self.base_models):
            S_test_i = np.zeros((T.shape[0], len(folds)))
            for j, (train_idx, test_idx) in enumerate(folds):
                X_train = X[train_idx]
                y_train = y[train_idx]
                X_holdout = X[test_idx]
                # y_holdout = y[test_idx]
                reg.fit(X_train, y_train)
                y_pred = reg.predict(X_holdout)[:]
                S_train[test_idx, i] = y_pred
                S_test_i[:, j] = reg.predict(T)[:]
            S_test[:, i] = S_test_i.mean(1)
        self.stacker.fit(S_train, y)
        y_pred = self.stacker.predict(S_test)[:]
        return y_pred


# Read in the dataset

In [6]:
    
seed = 2017
nfold = 5
bmodels = ["elasticnet","et","lgb","rf","xgb"]
    
train = pd.read_csv("../../data/X_train_v2.csv")
y = train['SalePrice']
X = train.loc[:,'MSSubClass':'SaleCondition_Partial']
    
test = pd.read_csv("../../data/X_test_v2.csv")
id = test["Id"]
T = test.loc[:,'MSSubClass':'SaleCondition_Partial']
    
#Set the base models

base_models_name = []
for j in range(len(bmodels)):
        modelname = ("../../models/single/model_" + bmodels[j] + ".py")
        print(modelname)
        base_models_name.append(modelname)

print(base_models_name)
base_models = []
    
for i, bm in enumerate(base_models_name):
        model = !grep "model =" {bm}
        model = model[0]
        model = model[12:]
        model = eval(model)
        base_models.append(model)
#print(model)
#print(base_models) 

# Set the stacker model

xgb = xgb.XGBRegressor(n_estimators=250,learning_rate=0.1,max_depth=4,min_child_weight=1,objective='reg:linear')
# Call stacking
    
ens = Ensemble(n_folds=nfold, stacker=xgb,base_models=base_models,seed_value=seed)

../../models/single/model_elasticnet.py
../../models/single/model_et.py
../../models/single/model_lgb.py
../../models/single/model_rf.py
../../models/single/model_xgb.py
['../../models/single/model_elasticnet.py', '../../models/single/model_et.py', '../../models/single/model_lgb.py', '../../models/single/model_rf.py', '../../models/single/model_xgb.py']


In [7]:
    
results = ens.fit_predict(X,y,T)
print("results=",results)

folds= [(array([   0,    1,    2, ..., 1436, 1437, 1438]), array([  17,   20,   25,   41,   54,   55,   56,   57,   61,   64,   93,
        100,  102,  114,  115,  116,  117,  119,  120,  123,  128,  160,
        166,  168,  179,  183,  188,  189,  191,  192,  196,  213,  216,
        222,  230,  232,  235,  238,  239,  243,  257,  261,  262,  263,
        270,  271,  273,  277,  282,  296,  299,  300,  313,  317,  319,
        331,  332,  333,  350,  353,  356,  358,  364,  371,  374,  375,
        377,  379,  383,  389,  391,  397,  401,  411,  413,  414,  416,
        421,  423,  426,  433,  434,  435,  436,  441,  443,  449,  451,
        457,  460,  461,  462,  463,  464,  465,  478,  480,  481,  484,
        487,  492,  497,  499,  505,  510,  522,  530,  534,  536,  539,
        544,  546,  554,  555,  556,  557,  570,  571,  575,  577,  601,
        609,  617,  623,  636,  639,  642,  649,  650,  651,  652,  657,
        668,  679,  681,  684,  692,  698,  701,  714,  717,  718

In [8]:
results = np.expm1(results)

In [9]:
print("results=",results)
results = pd.DataFrame(results)
results = pd.concat([id,results], axis=1)
results.columns =["Id","SalePrice"]
print("results=",results)
results.to_csv("./ensembled_results.csv", index = False)

results= [  97615.140625  154915.0625    178057.359375 ...,  165100.640625
  123039.546875  226210.46875 ]
results=         Id      SalePrice
0     1461   97615.140625
1     1462  154915.062500
2     1463  178057.359375
3     1464  193808.718750
4     1465  187840.078125
5     1466  169516.000000
6     1467  180018.859375
7     1468  167481.328125
8     1469  181983.218750
9     1470  119354.906250
10    1471  183439.406250
11    1472   94102.484375
12    1473   92549.671875
13    1474  149598.937500
14    1475  111228.656250
15    1476  390285.250000
16    1477  250941.859375
17    1478  308475.375000
18    1479  266444.406250
19    1480  596359.250000
20    1481  337628.968750
21    1482  211879.906250
22    1483  174220.343750
23    1484  167095.250000
24    1485  196084.109375
25    1486  195175.343750
26    1487  335670.875000
27    1488  233510.750000
28    1489  203888.250000
29    1490  234890.843750
...    ...            ...
1429  2890   70925.218750
1430  2891  135507.375000


In [10]:
results

Unnamed: 0,Id,SalePrice
0,1461,97615.140625
1,1462,154915.062500
2,1463,178057.359375
3,1464,193808.718750
4,1465,187840.078125
5,1466,169516.000000
6,1467,180018.859375
7,1468,167481.328125
8,1469,181983.218750
9,1470,119354.906250


In [11]:
id

0       1461
1       1462
2       1463
3       1464
4       1465
5       1466
6       1467
7       1468
8       1469
9       1470
10      1471
11      1472
12      1473
13      1474
14      1475
15      1476
16      1477
17      1478
18      1479
19      1480
20      1481
21      1482
22      1483
23      1484
24      1485
25      1486
26      1487
27      1488
28      1489
29      1490
        ... 
1429    2890
1430    2891
1431    2892
1432    2893
1433    2894
1434    2895
1435    2896
1436    2897
1437    2898
1438    2899
1439    2900
1440    2901
1441    2902
1442    2903
1443    2904
1444    2905
1445    2906
1446    2907
1447    2908
1448    2909
1449    2910
1450    2911
1451    2912
1452    2913
1453    2914
1454    2915
1455    2916
1456    2917
1457    2918
1458    2919
Name: Id, dtype: int64