In [19]:
# auto reload libraries (you do need to re-import libraries if you make changes)
%load_ext autoreload
%autoreload 2

# base 
import pandas as pd
import numpy as np
from pprint import pprint
import os

# preprocessing 
from sklearn.pipeline import Pipeline
from sklearn.experimental import enable_iterative_imputer 
from sklearn.impute import SimpleImputer, IterativeImputer,KNNImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder, RobustScaler
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import make_scorer
from sklearn.compose import TransformedTargetRegressor
from sklearn.preprocessing import PowerTransformer
from sklearn.decomposition import TruncatedSVD
# variable importance
from sklearn.inspection import permutation_importance

# models
from sklearn.ensemble import GradientBoostingRegressor

# visualize
import matplotlib.pyplot as plt

# own defined functions/classes 
from preprocessing.preprocess_pipe import MakeLowerCase
from preprocessing.preprocess_pipe import HdResolutionCategorizer
from preprocessing.preprocess_pipe import StorageCategorizer
from preprocessing.preprocess_pipe import SsdCategorizer

from preprocessing.preprocess_pipe import print_missing
from preprocessing.preprocess_pipe import calculate_perf
from preprocessing.preprocess_pipe import custom_scoring_func_single_p

from modelling.weight_samples import weights_samples

from postprocessing.postprocessing import plot_predictions_results
from postprocessing.postprocessing import plot_residuals

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [2]:
# global parameters
pd.set_option('display.max_columns', 500)
# style for plotting
plt.style.use('ggplot')
# interactive plotting
%matplotlib widget
# run grid search
RUN_GRID_SEARCH = True
# set working directory
uppath = lambda _path, n: os.sep.join(_path.split(os.sep)[:-n])
__file__ = 'C:\\Users\\Pieter-Jan\\Documents\\KuLeuven\\Semester2\\AA\\AAForB_Assignment2\\code\\PieterJan'
out = uppath(__file__, 2)
os.chdir(out)
print(os.getcwd())

C:\Users\Pieter-Jan\Documents\KuLeuven\Semester2\AA\AAForB_Assignment2


# Step 1: import data and split in train and validation set
The validation set is more our own kind of test set

In [3]:
# read in trainig and validation data
# use the same data split as we did in R
df_all_train = pd.read_csv("data\\train.csv", sep=',')
df_test = pd.read_csv("data\\test.csv", sep=',')

print(f'Dimensions of all training data {df_all_train.shape}')
print(f'Dimension test data {df_test.shape}')

Dimensions of all training data (510, 22)
Dimension test data (222, 20)


In [4]:
# split in training and validation set
df_train = df_all_train.sample(frac=0.75, random_state=0, replace=False)
df_val = df_all_train.drop(df_train.index)

# reset index, if you don't resit missing rows get inserted in the pipeline
# see: https://stackoverflow.com/questions/31323499/sklearn-error-valueerror-input-contains-nan-infinity-or-a-value-too-large-for
df_train = df_train.reset_index().drop('index',axis=1)
df_val = df_val.reset_index().drop('index',axis=1)

# Step 2: define pipeline

In [5]:
#Numerical features to pass down the numerical pipeline 
numerical_features = ['screen_size' ,'pixels_x','pixels_y',
                      'ram', 'weight','ssd','storage']

#Categrical features to pass down the categorical pipeline 
categorical_features = ['brand','base_name', 'screen_surface','touchscreen',
                        'cpu','cpu_details','detachable_keyboard',
                        'discrete_gpu','gpu', 'os','os_details']

# define all unique features
features = np.unique(numerical_features).tolist() + np.unique(categorical_features).tolist()

# target variables
target = ['min_price','max_price']

In [7]:
# train
X_train = df_train[features]
y_train = df_train[target]

# validation (this is kind our own test set)
X_val  = df_val[features]
y_val = df_val[target]

# train_validation (this is all training data we have) for fitting the model
X_all_train = df_all_train[features]
y_all_train = df_all_train[target]

# test
X_test = df_test[features]

In [8]:
# you can add many more and 
# you can even define custom preprocessing steps like 'MakeLowerCase()'

# pipeline  uses only numerical features,
numeric_transformer = Pipeline(steps=[
    #('imputer', SimpleImputer(missing_values=np.nan, strategy='mean')),
    #('imputer', KNNImputer(n_neighbors=5, weights='uniform',metric='nan_euclidean')),
    ('imputer', IterativeImputer(missing_values=np.nan, random_state=0, n_nearest_features=None, sample_posterior=True)),
    #('transformation', PowerTransformer(method='yeo-johnson',standardize=False)),
     #Scale features using statistics that are robust to outliers.
    ('scaler', RobustScaler(with_centering=True, with_scaling=True, quantile_range=(10.0, 90.0)))]) 

# pipeline use only categorical features
categorical_transformer = Pipeline(steps=[ 
    ('lowercase', MakeLowerCase()), # lower cases all columns containing strings
    #('sd_category' ,SsdCategorizer(drop_original_feature=True)),
    #('storage_category', StorageCategorizer(drop_original_feature=True)),
    ('imputer', SimpleImputer(missing_values=np.nan, strategy="most_frequent")),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))])

# add both preprocessing pipelines in one pipeline
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numerical_features),
        ('cat', categorical_transformer, categorical_features)])

In [9]:
# this is how the preporcessed pipeline looks like (just to have an idea)
pd.DataFrame(preprocessor.fit_transform(X_train)).shape

(382, 1)

# Step 3: add models to pipeline

Possible to fit multiple target variabels, so you **don't** need to fit a different models for min. price and max. price

### A) Training and parameter tuning

##### 1) Automatic tuning via grid search

I will only **do the tuning for the minimum price** and use the found parameters also for the maximum price

In [10]:
# define model: I just add some default parameters but you could
# also just write: RandomForestRegressor() since we will perform a grid search 
# to find good hyperparameter values
model_min_p = GradientBoostingRegressor(random_state=1)
model_max_p = GradientBoostingRegressor(random_state=1)

# Look at parameters used by our current forest
print('Parameters currently in use:\n')
pprint(model_min_p.get_params())

# add to pipeline
# 1) min price
pipeline_min_p = Pipeline(memory=None,
              steps=[('preprocessor', preprocessor),
                     ('regressor', model_min_p)])


# add transformation on the target variable, by default power transformation 
# also performs standardization after performing the power transformation
# and back transform to the original space when outputting predictions 

# 1) min price
transformer_target = PowerTransformer(method='yeo-johnson',standardize=False)
scale_target = RobustScaler(with_centering=True, with_scaling=True, quantile_range=(10.0, 90.0))
pipeline_y = Pipeline(memory=None,
              steps=[('transformer', transformer_target),
                     ('scaler',scale_target)])

pipeline_min_p_update = TransformedTargetRegressor(regressor=pipeline_min_p, 
                                         transformer=pipeline_y)

Parameters currently in use:

{'alpha': 0.9,
 'ccp_alpha': 0.0,
 'criterion': 'friedman_mse',
 'init': None,
 'learning_rate': 0.1,
 'loss': 'ls',
 'max_depth': 3,
 'max_features': None,
 'max_leaf_nodes': None,
 'min_impurity_decrease': 0.0,
 'min_impurity_split': None,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'n_estimators': 100,
 'n_iter_no_change': None,
 'presort': 'deprecated',
 'random_state': 1,
 'subsample': 1.0,
 'tol': 0.0001,
 'validation_fraction': 0.1,
 'verbose': 0,
 'warm_start': False}


In [11]:
# loss function
loss_function = ['ls', 'lad']

learning_rate = [0.1, 0.01, 0.005, 0.001]

# The number of boosting stages to perform. 
# Gradient boosting is fairly robust to over-fitting so a large number usually results in better performance.
n_estimators = [int(x) for x in np.linspace(start = 500, stop = 5000, num = 20)]

# The number of features to consider when looking for the best split:
# - If “auto”, then max_features=n_features.
# - If “sqrt”, then max_features=sqrt(n_features).
# - If “log2”, then max_features=log2(n_features).
# - If None, then max_features=n_features
max_features = ['auto', 'sqrt','log2']

# maximum depth of the individual regression estimators
max_depth = [int(x) for x in np.linspace(20, 80, num = 10)]

n_iter_no_change = [10,20,50]

# The function to measure the quality of a split
criterion = ['friedman_mse']

# Minimum number of samples required to split a node
# - If int, then consider min_samples_split as the minimum number.
# - If float, then min_samples_split is a fraction and ceil(min_samples_split * n_samples) are the minimum number of samples for each split.
min_samples_split = [0.001, 0.01, 0.1, 0.3, 0.5, 0.7 , 0.9]
# Minimum number of samples required at each leaf node

# - If int, then consider min_samples_leaf as the minimum number.
# - If float, then min_samples_leaf is a fraction and ceil(min_samples_leaf * n_samples) are the minimum number of samples for each node.
min_samples_leaf = [0.001, 0.01, 0.1, 0.3, 0.5]

# Create the random grid
random_grid = {
   'regressor__regressor__n_estimators': n_estimators,
   'regressor__regressor__learning_rate': learning_rate,
   'regressor__regressor__max_features': max_features,
   'regressor__regressor__max_depth': max_depth,
   'regressor__regressor__criterion': criterion,
   'regressor__regressor__min_samples_split': min_samples_split,
   'regressor__regressor__min_samples_leaf': min_samples_leaf,
   'regressor__regressor__n_iter_no_change': n_iter_no_change,
   'regressor__regressor__loss': loss_function 
}

In [37]:
# define random search (and narrow down time grid search)

if RUN_GRID_SEARCH:
    min_p_random_search = RandomizedSearchCV(
       estimator = pipeline_min_p_update, 
       param_distributions = random_grid, n_iter = 40,
       cv = 5, verbose=2, random_state=1, n_jobs = -1, refit=True,
       scoring=make_scorer(custom_scoring_func_single_p, greater_is_better=False)
    )

    # run grid search and refit with best hyper parameters
    weights_train_min_p =  weights_samples(df=y_train.iloc[:,0], order=0, plot_weights=False)
    min_p_random_search.fit(X_train, y_train.iloc[:,0])  
    print(min_p_random_search.best_params_)    
    print(min_p_random_search.best_score_)


# have look at the best hyperparameters and their respective performance (maybe also look at the sd)
pd.DataFrame(min_p_random_search.cv_results_).sort_values(
        by=['mean_test_score'],ascending=False).head(5)

Sum weights: 1.0
Fitting 5 folds for each of 40 candidates, totalling 200 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  33 tasks      | elapsed:   29.2s
[Parallel(n_jobs=-1)]: Done 154 tasks      | elapsed:  2.0min
[Parallel(n_jobs=-1)]: Done 200 out of 200 | elapsed:  2.4min finished


[Pipeline] ...... (step 1 of 2) Processing preprocessor, total=   0.5s
[Pipeline] ......... (step 2 of 2) Processing regressor, total=   5.6s
{'regressor__regressor__n_iter_no_change': 50, 'regressor__regressor__n_estimators': 4526, 'regressor__regressor__min_samples_split': 0.1, 'regressor__regressor__min_samples_leaf': 0.001, 'regressor__regressor__max_features': 'sqrt', 'regressor__regressor__max_depth': 80, 'regressor__regressor__loss': 'ls', 'regressor__regressor__learning_rate': 0.001, 'regressor__regressor__criterion': 'friedman_mse'}
-154.27339942002402


Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_regressor__regressor__n_iter_no_change,param_regressor__regressor__n_estimators,param_regressor__regressor__min_samples_split,param_regressor__regressor__min_samples_leaf,param_regressor__regressor__max_features,param_regressor__regressor__max_depth,param_regressor__regressor__loss,param_regressor__regressor__learning_rate,param_regressor__regressor__criterion,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
14,8.187409,0.977632,0.190211,0.017478,50,4526,0.1,0.001,sqrt,80,ls,0.001,friedman_mse,"{'regressor__regressor__n_iter_no_change': 50,...",-172.543212,-137.008171,-182.897837,-158.899036,-120.018741,-154.273399,23.004394,1
10,5.216782,0.973916,0.046182,0.012869,50,1921,0.3,0.01,sqrt,60,lad,0.01,friedman_mse,"{'regressor__regressor__n_iter_no_change': 50,...",-178.471914,-142.201003,-174.516339,-157.313875,-120.460095,-154.592645,21.415704,2
30,3.48662,0.891358,0.105025,0.028988,20,3815,0.01,0.001,log2,60,ls,0.005,friedman_mse,"{'regressor__regressor__n_iter_no_change': 20,...",-183.715091,-143.18889,-179.686058,-168.444872,-120.421191,-159.09122,23.94255,3
31,5.3964,1.325245,0.032405,0.012875,20,3815,0.3,0.01,auto,53,lad,0.01,friedman_mse,"{'regressor__regressor__n_iter_no_change': 20,...",-191.250734,-138.720086,-189.487549,-158.105587,-125.633907,-160.639573,26.387613,4
28,4.791781,1.123877,0.040984,0.013858,20,2157,0.3,0.01,auto,80,lad,0.01,friedman_mse,"{'regressor__regressor__n_iter_no_change': 20,...",-191.250734,-138.720086,-189.487549,-158.105587,-125.633907,-160.639573,26.387613,4


##### 2) Manual parameters tuning

In [53]:
# train your final model on all data with best parameters 

# 1) min price
model_min_p = GradientBoostingRegressor(
     criterion='friedman_mse',
     n_estimators=4526,
     learning_rate=0.001,
     loss = 'ls',
     max_depth=80,
     max_features='sqrt',
     n_iter_no_change = 50,
     min_samples_split=0.1,
     min_samples_leaf=0.001,
     random_state=0
)

# 2)  min price
model_max_p = GradientBoostingRegressor(
     criterion='friedman_mse',
     n_estimators=4526, 
     learning_rate=0.001,
     loss = 'ls',
     max_depth=80,
     max_features='sqrt',
     n_iter_no_change = 50,
     min_samples_split=0.1,
     min_samples_leaf=0.001,
     random_state=0
)


# add to pipeline

# 1) min price
pipeline_min_p = Pipeline(memory=None,
              steps=[('preprocessor', preprocessor),
                     ('regressor', model_min_p)],
              verbose=True)

# 2) min price
pipeline_max_p = Pipeline(memory=None,
              steps=[('preprocessor', preprocessor),
                     ('regressor', model_max_p)],
              verbose=True)

# again add transformer for target variable

# 1) min price
pipeline_min_p_update = TransformedTargetRegressor(regressor=pipeline_min_p, 
                                         transformer=pipeline_y)
# 2) max price
pipeline_max_p_update = TransformedTargetRegressor(regressor=pipeline_max_p, 
                                         transformer=pipeline_y)

# fit final model on all training data we have at hand

# 1) min price
weights_train_min_p = weights_samples(y_train.iloc[:,0], order=0)
pipeline_min_p_update.fit(X_train, y_train.iloc[:,0])

# 2) max price
weights_train_max_p = weights_samples(y_train.iloc[:,1], order=0)
pipeline_max_p_update.fit(X_train, y_train.iloc[:,1])

Sum weights: 1.0
[Pipeline] ...... (step 1 of 2) Processing preprocessor, total=   0.1s
[Pipeline] ......... (step 2 of 2) Processing regressor, total=   3.0s
Sum weights: 1.0
[Pipeline] ...... (step 1 of 2) Processing preprocessor, total=   0.1s
[Pipeline] ......... (step 2 of 2) Processing regressor, total=   3.2s


TransformedTargetRegressor(check_inverse=True, func=None, inverse_func=None,
                           regressor=Pipeline(memory=None,
                                              steps=[('preprocessor',
                                                      ColumnTransformer(n_jobs=None,
                                                                        remainder='drop',
                                                                        sparse_threshold=0.3,
                                                                        transformer_weights=None,
                                                                        transformers=[('num',
                                                                                       Pipeline(memory=None,
                                                                                                steps=[('imputer',
                                                                                                        Itera

In [55]:
# performance on data where the model was fit one (should be very low)
pred_train_min_p = pipeline_min_p_update.predict(X_train)
pred_train_max_p = pipeline_max_p_update.predict(X_train)

# calculate performance
pred_train = pd.DataFrame([pred_train_min_p,pred_train_max_p]).T
calculate_perf(y_train, pred_train)

{'minimum price': 77.36006022660138,
 'maximum price': 78.52107158666128,
 'total error': 155.88113181326264}

##### Performance on validation data

In [14]:
# performance on validation data
pred_val_min_p = pipeline_min_p_update.predict(X_val)
pred_val_max_p = pipeline_max_p_update.predict(X_val)

# calculate performance 
pred_val = pd.DataFrame([pred_val_min_p,pred_val_max_p]).T
calculate_perf(y_val, pred_val)

{'minimum price': 117.93621711781617,
 'maximum price': 120.70283932821516,
 'total error': 238.63905644603133}

In [15]:
# save predictions on validation data
# submission format
submission_format_validation = pd.DataFrame.from_dict(
 {'ID':df_val['id'].values,
 'MIN':pred_val_min_p,
 'MAX':pred_val_max_p}).set_index('ID')

# write to csv
submission_format_validation.to_csv('code\\PieterJan\\python\\output\\validation\\gradient_boosting.csv' ,
                            header=True, index=True, sep=',')

### C) Post processing

In [16]:
fig, axs = plt.subplots(2, 2, figsize=(9, 7))
axs = np.ravel(axs)

# fitted against true predictions minimum price
plot_predictions_results(ax=axs[0], 
                        y_true=y_val.iloc[:,0], 
                        y_pred=pred_val_min_p, 
                        title="Boosting Min. Price Test Set", 
                        log_scale=True)

# fitted against true predictions maximum price
plot_predictions_results(ax=axs[1], 
                        y_true=y_val.iloc[:,1], 
                        y_pred=pred_val_max_p, 
                        title="Boosting Max. Price Test Set", 
                        log_scale=True)

# residuals minimum price
plot_residuals(ax=axs[2], 
               y_true=y_val.iloc[:,0], 
               y_pred=pred_val_min_p, 
               title="", 
               log_scale=False,
               order=1)

# residuals maximum price
plot_residuals(ax=axs[3], 
               y_true=y_val.iloc[:,1], 
               y_pred=pred_val_max_p, 
               title="", 
               log_scale=False,
               order=1)
fig.tight_layout()
plt.savefig('code\\PieterJan\\python\\output\\figures\\gradient_boosting\\fig1.png')

Canvas(toolbar=Toolbar(toolitems=[('Home', 'Reset original view', 'home', 'home'), ('Back', 'Back to previous …

In [17]:
plt.figure(figsize=(8,4))
plt.plot(pred_val_max_p - pred_val_min_p, label="Predictions Test", linestyle="--")
plt.plot(y_val.iloc[:,1] - y_val.iloc[:,0], label="Truth Test", linestyle=':')
plt.xlabel("Observations")
plt.ylabel("Max. Price - Min. Price")
plt.legend(loc="upper right");
plt.savefig('code\\PieterJan\\python\\output\\figures\\gradient_boosting\\fig2.png')

Canvas(toolbar=Toolbar(toolitems=[('Home', 'Reset original view', 'home', 'home'), ('Back', 'Back to previous …

In [49]:
vip_val_min_p = permutation_importance(pipeline_min_p_update, X_val, y_val.iloc[:,0], n_repeats=10,
                                random_state=1, n_jobs=3)

vip_val_max_p = permutation_importance(pipeline_min_p_update, X_val, y_val.iloc[:,1], n_repeats=10,
                                random_state=1, n_jobs=3)

sorted_idx_min_p = vip_val_min_p.importances_mean.argsort()
sorted_idx_max_p = vip_val_max_p.importances_mean.argsort()

In [48]:
fig, ax = plt.subplots(1, 2, figsize=(9, 4))
ax = np.ravel(ax)
# minimum price
ax[0].boxplot(vip_val_min_p.importances[sorted_idx_min_p].T,
           vert=False, labels=X_val.columns[sorted_idx_min_p])
ax[0].set_title("Permutation Importances: Min. Price (Test set)", fontsize=9)
ax[0].xaxis.set_tick_params(labelsize=8)
ax[0].yaxis.set_tick_params(labelsize=8)

# maximum price
ax[1].boxplot(vip_val_min_p.importances[sorted_idx_max_p].T,
           vert=False, labels=X_val.columns[sorted_idx_max_p])
ax[1].set_title("Permutation Importances: Max. Price (Test set)", fontsize=9)
ax[1].xaxis.set_tick_params(labelsize=8)
ax[1].yaxis.set_tick_params(labelsize=8)
fig.tight_layout()
plt.show()
plt.savefig('code\\PieterJan\\python\\output\\figures\\gradient_boosting\\fig3.png')



Canvas(toolbar=Toolbar(toolitems=[('Home', 'Reset original view', 'home', 'home'), ('Back', 'Back to previous …

### D) Predictions test data

In [50]:
# train your final model on all data with best parameters 

# 1) min price
model_min_p_final = GradientBoostingRegressor(
     criterion='friedman_mse',
     n_estimators=4526, 
     learning_rate=0.001,
     loss = 'ls',
     max_depth=80,
     max_features='sqrt',
     n_iter_no_change = 50,
     min_samples_split=0.1,
     min_samples_leaf=0.001,
     random_state=0
)

# 2)  min price
model_max_p_final = GradientBoostingRegressor(
     criterion='friedman_mse',
     n_estimators=4526, 
     learning_rate=0.001,
     loss = 'ls',
     max_depth=80,
     max_features='sqrt',
     n_iter_no_change = 50,
     min_samples_split=0.1,
     min_samples_leaf=0.001,
     random_state=0
)


# add to pipeline

# 1) min price
pipeline_min_p_final = Pipeline(memory=None,
              steps=[('preprocessor', preprocessor),
                     ('regressor', model_min_p_final)],
              verbose=True)

# 2) min price
pipeline_max_p_final = Pipeline(memory=None,
              steps=[('preprocessor', preprocessor),
                     ('regressor', model_max_p_final)],
              verbose=True)

# again add transformer for target variable

# 1) min price
pipeline_min_p_final = TransformedTargetRegressor(regressor=pipeline_min_p_final, 
                                         transformer=pipeline_y)
# 2) max price
pipeline_max_p_final = TransformedTargetRegressor(regressor=pipeline_max_p_final, 
                                         transformer=pipeline_y)

# fit final model on all training data we have at hand

# 1) min price
weights_all_train_min_p = weights_samples(y_all_train.iloc[:,0], order=2)
pipeline_min_p_final.fit(X_all_train, y_all_train.iloc[:,0])

# 2) max price
weights_all_train_max_p = weights_samples(y_all_train.iloc[:,1], order=2)
pipeline_max_p_final.fit(X_all_train, y_all_train.iloc[:,1])

Sum weights: 1.0
[Pipeline] ...... (step 1 of 2) Processing preprocessor, total=   0.2s
[Pipeline] ......... (step 2 of 2) Processing regressor, total=   4.5s
Sum weights: 1.0
[Pipeline] ...... (step 1 of 2) Processing preprocessor, total=   0.2s
[Pipeline] ......... (step 2 of 2) Processing regressor, total=   4.7s


TransformedTargetRegressor(check_inverse=True, func=None, inverse_func=None,
                           regressor=Pipeline(memory=None,
                                              steps=[('preprocessor',
                                                      ColumnTransformer(n_jobs=None,
                                                                        remainder='drop',
                                                                        sparse_threshold=0.3,
                                                                        transformer_weights=None,
                                                                        transformers=[('num',
                                                                                       Pipeline(memory=None,
                                                                                                steps=[('imputer',
                                                                                                        Itera

In [51]:
# performance on all data where the model was fit one (should be very low)
pred_all_train_min_p = pipeline_min_p_final.predict(X_all_train)
pred_all_train_max_p = pipeline_max_p_final.predict(X_all_train)

# calculate performance
pred_all_train = pd.DataFrame([pred_all_train_min_p, pred_all_train_max_p]).T
calculate_perf(y_all_train, pred_all_train)

{'minimum price': 77.99420904859433,
 'maximum price': 79.89283497807546,
 'total error': 157.8870440266698}

In [52]:
# make predictions on test data
pred_test_min_p = pipeline_min_p_final.predict(X_test)
pred_test_max_p = pipeline_max_p_final.predict(X_test)

# submission format
submission_format = pd.DataFrame.from_dict(
 {'ID':df_test['id'].values,
 'MIN':pred_test_min_p,
 'MAX':pred_test_max_p}).set_index('ID')

# write to csv
submission_format.to_csv('code\\PieterJan\\python\\output\\submission\\gradient_boosting.csv' ,
                            header=True, index=True, sep=',')