In [1]:
# Imports of the packages used in this notebook
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import train_test_split

In [2]:
# Load the feature table and the target table; both files are ';'-separated
# and use the "Id" column as the index.
# NOTE(review): the original comment here read "_sqprice for price per square feet" —
# presumably a naming hint for an alternative file/target; confirm.
X_train = pd.read_csv(
    "X_train.csv",
    sep=';',
    index_col="Id",
)
y_train = pd.read_csv(
    "y_train.csv",
    sep=';',
    index_col="Id",
)

# Reading X_test.csv is disabled:
#X_test=pd.read_csv("X_test.csv",sep=';',index_col="Id")
# Instead, 20% of the training rows are held out (fixed seed 42) and exposed as
# X_test / y_test; the remaining 80% are X_train_split1 / y_train_split1.
X_train_split1, X_test, y_train_split1, y_test = train_test_split(
    X_train,
    y_train,
    test_size=0.2,
    random_state=42,
)

In [3]:
# A function to evaluate the fit (RMSE) of the predictions
from sklearn.metrics import mean_squared_error
def evaluate_fit(y_test,predictions):
    """Return the root mean squared error (RMSE) of `predictions` against `y_test`.

    Both arguments are handed unchanged to `mean_squared_error`; the square root
    of that value is returned.
    """
    return np.sqrt(mean_squared_error(y_test, predictions))

In [5]:
#necessary imports: decisiontreeregressor, np,pandas, train_test_split
def ensemble_fct(random_state=42,n=20,test_size=0.2,max_features=None,max_depth=6,report_mse=False):
    """Fit `n` decision trees on random subsamples and build cumulative averaged ensembles.

    For each of `n` distinct split seeds, a subsample of the module-level training
    split (`X_train_split1`, `y_train_split1`) is drawn with `train_test_split`, a
    `DecisionTreeRegressor` is fitted on it, and predictions are made for the
    module-level hold-out features `X_test`. Ensemble k (k = 1..n) is the mean of
    the first k trees' predictions. Every single tree and every ensemble is scored
    against the module-level `y_test` with `evaluate_fit` (RMSE).

    +++++parameters+++++
    random_state: [None, int or np.random.RandomState] seed for the trees AND for
        drawing the subsample split seeds, so that an integer value makes the
        whole run reproducible. None draws fresh randomness on every call.
    n: [int, 1..2000] how many subsamples/trees to use. The upper bound exists
        because split seeds are drawn without replacement from range(2000).
    test_size: [float (0,1)] forwarded to `train_test_split`: the share of the
        training split that is left OUT of each subsample; every tree is fitted
        on the remaining part.
    max_features: [int or float(0,1)] how many features are considered at each
        split. This is used in random forest methods to decorrelate trees.
    max_depth: [int] maximum depth of the single trees.
    report_mse: [bool] if truthy, print the score of every single tree and of
        every ensemble.

    +++++returns+++++
    dict with the keys
        "mse_tree":     list of per-tree scores (RMSE, despite the key name)
        "mse_ensemble": list of per-ensemble scores (RMSE, despite the key name)
        "trees":        DataFrame of single-tree predictions, columns "0".."n-1"
        "ensemble":     DataFrame of ensemble predictions, columns "Ensemble0"..
        "states":       list of the split seeds that were used

    +++++raises+++++
    ValueError: if `n` is not between 1 and 2000.
    """
    # Split seeds are sampled without replacement from range(state_pool), so at
    # most state_pool distinct seeds exist; n == 0 would leave nothing to ensemble.
    state_pool = 2000
    if not 1 <= n <= state_pool:
        raise ValueError("n must be between 1 and "+str(state_pool)+", got "+str(n))

    # Draw the split seeds from a generator derived from `random_state` rather than
    # from the global numpy RNG; otherwise `random_state` could not reproduce a run.
    if isinstance(random_state, np.random.RandomState):
        rng = random_state
    else:
        rng = np.random.RandomState(random_state)
    states = rng.choice(state_pool, size=n, replace=False).tolist()
    print("generated "+str(n)+" random states")

    # One estimator object is re-fitted for every subsample; its predictions are
    # stored right after each fit, so the re-use does not mix up the trees.
    dtree = DecisionTreeRegressor(random_state=random_state,max_depth=max_depth,max_features=max_features)
    tree_predictions = {}
    for i, state in enumerate(states):
        X_sub, _, y_sub, _ = train_test_split(
            X_train_split1, y_train_split1, test_size=test_size, random_state=state
        )
        dtree.fit(X_sub, y_sub)
        tree_predictions[str(i)] = dtree.predict(X_test)
    # Build the frame once instead of inserting n columns one by one.
    all_predictions = pd.DataFrame(tree_predictions, index=X_test.index)

    # Score every single tree; `returning` collects all outputs for further use.
    returning = {}
    mse_tree = []
    for label in all_predictions.columns:
        score = evaluate_fit(y_test, all_predictions[label])
        mse_tree.append(score)
        if report_mse:
            print(label+". Tree: "+str(score))
    returning["mse_tree"] = mse_tree
    print("generated "+str(n)+" predictions")

    # Ensemble k = running sum of the first k trees divided by k (the first
    # ensemble is just the first tree). np.cumsum accumulates left to right,
    # i.e. in the same order as adding the columns one after another.
    running_sum = np.cumsum(all_predictions.to_numpy(), axis=1)
    trees_per_ensemble = np.arange(1, n + 1)
    ensemble = pd.DataFrame(
        running_sum / trees_per_ensemble,
        index=X_test.index,
        columns=["Ensemble"+label for label in all_predictions.columns],
    )
    print("generated "+str(n)+" ensembles")

    # Score every ensemble.
    mse_ensemble = []
    for label in ensemble.columns:
        score = evaluate_fit(y_test, ensemble[label])
        if report_mse:
            print(label+": "+str(score))
        mse_ensemble.append(score)

    # Return a dictionary with all the relevant outputs for further use.
    returning["mse_ensemble"] = mse_ensemble
    returning["trees"] = all_predictions
    returning["ensemble"] = ensemble
    returning["states"] = states

    return returning